mort.data <- read.delim("C:/Users/rageg/Downloads/mortality3.1.txt", header = TRUE, sep = "")
#Data Cleaning
mort.data <- mort.data %>%
  # Removing the Year column as it contains integers and strings, while the
  # Year-code column provides the same information but with just the year (integer)
  select(-Year, -MultipleCauseofdeathCode) %>%
  # Renaming the Year.Code column as year
  rename("Year" = "YearCode") %>%
  # Removing the 2023 column as it is unreliable and is constantly being updated.
  rename("State" = "X.State.") %>%
  # Removing the 2023 column as it is unreliable and is constantly being updated.
  rename("% Of total Deaths" = "X.ofTotalDeaths") %>%
  # Removing the 2023 column as it is unreliable and is constantly being updated.
  filter(Year != 2023)

The variables I will focus on in this assignment are “Year”, “State”, “Population” and “Multiple Cause Of Death”. The outcome variable, or variable we will be analyzing is “Death”. Based on my data set, I believe that total deaths due to drug abuse have decreased over the years for each state due to advancing education and resources/aid in regards to narcotic abuse/addiction. I also believe that the states with the highest ratio of Narcotic death rates are in the South-West Region based on popular beliefs. And finally, I believe Heroin use everywhere has increased with the years.

#Mapping each state to a region by creating a new region column
states <- unique(mort.data$State)

state_to_region <- data.frame(
  State = states,
  Region = c(
  "Southeast", "West", "Southwest", "Southeast", "West",
  "West", "Northeast", "Northeast", "Northeast", "Southeast", "Southeast",
  "West", "West", "Midwest", "Midwest", "Midwest",
  "Midwest", "Southeast", "Southeast", "Northeast", "Northeast",
  "Northeast", "Midwest", "Midwest", "Southeast", "Midwest",
  "West", "Midwest", "West", "Northeast", "Northeast",
  "Southwest", "Northeast", "Southeast", "Midwest", "Midwest",
  "Southeast", "West", "Northeast", "Northeast", "Southeast", "Midwest",
  "Southeast", "Southeast", "West", "Northeast", "Southeast",
  "West", "Southeast", "Midwest", "West"
)
)


# Merging the original dataset with the mapping
mort.data <- merge(mort.data, state_to_region, by = "State", all.x = TRUE)

First Hypothesis: Total Deaths in relation to narcotics have decreased over the years from 1999-2020 for each state.

#First Plot: Total Deaths vs Year for Each state on the same graph, interactive.

totalDeaths.df <- mort.data %>%
  group_by(State, Year, Region) %>%
  summarise(TotalDeaths = sum(Deaths))
## `summarise()` has grouped output by 'State', 'Year'. You can override using the
## `.groups` argument.
p <- ggplot(totalDeaths.df, aes(x = Year, y = TotalDeaths, color = Region, group = State)) +
  geom_line() +
  geom_point() +
  labs(title = "Total Deaths Trendline by State (1999-2020)",
       x = "Year",
       y = "Total Deaths") +
  theme_minimal()


ggplotly(p)

On the contrary, There is a clear trend for all the states from the plot above that shows that narcotic usage has in fact significantly increased over the years for most of the states.

#STEP 4
pl <- ggplot(totalDeaths.df, aes(x = Year, y = TotalDeaths, color = Region, group = State)) +
  geom_line() +
  geom_point() +
  labs(title = "Total Deaths Trendline by State (1999-2020)",
       x = "Year",
       y = "Total Deaths") +
  theme_minimal() +
  facet_wrap(~ Region)

ggplotly(pl)

The above visuals is a subset of the Total Deaths by state vs Years. It facets the first graph into 5 graphs representing the different regions and their respective states. This makes it easier to analyze the most impact regions in regards to the data.

#STEP 5:
# Calculate slopes for each state
slopes <- totalDeaths.df %>%
  #Gets every unique state and region duo
  group_by(Region, State) %>%
  #Arrange the Year in ascending order
  arrange(Year) %>%
  #Gets the slope from the fitment of TotalDeaths vs Year
  mutate(slope = lm(TotalDeaths ~ Year)$coefficients[2])

#Identify the state with the steepest slope within each region
max_slope <- slopes %>%
  select(-Year)%>%
  group_by(Region) %>%
  slice(which.max(slope))
  

max_slope
## # A tibble: 5 × 4
## # Groups:   Region [5]
##   State      Region    TotalDeaths slope
##   <chr>      <chr>           <int> <dbl>
## 1 Ohio       Midwest           164 249. 
## 2 New York   Northeast         244 259. 
## 3 Florida    Southeast         385 206. 
## 4 Arizona    Southwest         158  65.5
## 5 California West             1653 134.

In the above table, I extracted the States which had the highest slope within each region, meaning the highest increase in deaths caused by narcotic over the same period of time(1999-2020). This is just to see which state impacts the data the most for their respective regions and the state with the highest rate of drug use for each region

selectedStates <- max_slope$State

topStates <- totalDeaths.df %>%
  filter(State %in% selectedStates)


pl3 <- ggplot(topStates, aes(x = Year, y = TotalDeaths, color = State, shape = Region)) +
  geom_line() +
  geom_point() +
  labs(title = "Total Deaths Trendline for States with Maximum Slope (1999-2020)",
       x = "Year",
       y = "Total Deaths",
       color = "State") +
  theme_minimal()

ggplotly(pl3)

The graphics I made do not support my initial hypothesis. I originally thought that the total deaths due to drugs had decreased over time, which contradicts with what my graphics are depicting.

#I will now choose The state with the minimum slope for each region-step 8

min_slope <- slopes %>%
  select(-Year)%>%
  group_by(Region) %>%
  slice(which.min(slope))
min_slope
## # A tibble: 5 × 4
## # Groups:   Region [5]
##   State        Region    TotalDeaths slope
##   <chr>        <chr>           <int> <dbl>
## 1 South Dakota Midwest             0  1.78
## 2 Vermont      Northeast          16  8.54
## 3 Arkansas     Southeast          10 10.3 
## 4 New Mexico   Southwest         202 19.7 
## 5 Hawaii       West               25  2.11
selectedStates2 <- min_slope$State

botStates <- totalDeaths.df %>%
  filter(State %in% selectedStates2)

pl4 <- ggplot(botStates, aes(x = Year, y = TotalDeaths, color = State, shape = Region)) +
  geom_line() +
  geom_point() +
  labs(title = "Total Deaths Trendline for States with Minimum Slope (1999-2020)",
       x = "Year",
       y = "Total Deaths",
       color = "State") +
  theme_minimal()

ggplotly(pl4)

Changing the highlighted observations does indeed change the way you think about the data. From the two different graphics, incorrect assumptions can be made. for example, In the max_slope graphic, the state with with the lowest increase in death with respect to the others is in the Southwest region, while in the min_slope graphic the state with the highest increase in death with respect to the others is also in the Southwest region.At first glance, this may be very confusing and some might draw the conclusion from the min_slope graph that the southwest has a lot of drug use, but that may be proven to be the a contradiction when observing the max_slope graphic, as SW in that one has the least slope between the maxes.

One of the main big takeaways from the above analysis is that deaths due to drug/narcotic abuse has only been increasing in recent years, contrary to beliefs. This is backed up using visuals that clearly depicts upwards trend for drug related deaths for the US, regions within the US, and more specifically states within the US.From focusing on just a few observations or a group of observations, data can easily be manipulated to convey conflicting ideas. On the contrary, also from doing that, hypothesis can be further emphasized to be correct.

# Assuming the variable "Year" is numeric, you can set breaks at 2000 and 2011
breaks <- c(2000, 2011, Inf)

# Create a new variable to categorize the years into two groups
topStates <- topStates %>%
  mutate(YearGroup = cut(Year, breaks = breaks, labels = c("2000-2011", "2011-2020"), include.lowest = TRUE))

# Plot the trendline with shaded regions
pl3 <- ggplot(topStates, aes(x = Year, y = TotalDeaths, color = State, shape = Region)) +
  geom_line() +
  geom_point() +
  geom_ribbon(aes(ymin = 0, ymax = TotalDeaths, fill = YearGroup), alpha = 0.2) +
  labs(title = "Total Deaths Trendline for States with Maximum Slope (1999-2020)",
       x = "Year",
       y = "Total Deaths",
       color = "State") +
  theme_minimal() +
  scale_fill_manual(values = c("2000-2011" = "lightblue", "2011-2020" = "lightgreen")) +
  guides(fill = guide_legend(title = "Year Group")) +
  annotate("text", x = 2005, y = max(topStates$TotalDeaths), label = "2000-2011", size = 4, color = "black") +
  annotate("text", x = 2015, y = max(topStates$TotalDeaths), label = "2011-2020", size = 4, color = "black")

ggplotly(pl3)